trans_date_trans_time - 거래발생시간
first - 이름
last - 성
gender - 성별
street - 주소
city - 도시
state - 주
zip - 우편번호
lat - 위도
long - 경도
city_pop - 인구
job - 직업
dob - 생년월일
import utils
import models
import train
import datasets
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
# Load the pre-split credit-card fraud train/test sets (features X, label y = is_fraud).
X_train, y_train = datasets.load_train_dataset()
X_test, y_test = datasets.load_test_dataset()
# Inspect schema: 21 columns, no nulls, mixed datetime/int/float/object dtypes.
X_train.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1296675 entries, 0 to 1296674 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 trans_date_trans_time 1296675 non-null datetime64[ns] 1 cc_num 1296675 non-null int64 2 merchant 1296675 non-null object 3 category 1296675 non-null object 4 amt 1296675 non-null float64 5 first 1296675 non-null object 6 last 1296675 non-null object 7 gender 1296675 non-null object 8 street 1296675 non-null object 9 city 1296675 non-null object 10 state 1296675 non-null object 11 zip 1296675 non-null int64 12 lat 1296675 non-null float64 13 long 1296675 non-null float64 14 city_pop 1296675 non-null int64 15 job 1296675 non-null object 16 dob 1296675 non-null datetime64[ns] 17 trans_num 1296675 non-null object 18 unix_time 1296675 non-null int64 19 merch_lat 1296675 non-null float64 20 merch_long 1296675 non-null float64 dtypes: datetime64[ns](2), float64(5), int64(4), object(10) memory usage: 217.6+ MB
# Correlation heatmap of the raw training features.
utils.draw_heatmap(X_train)
# Rename columns to shorter, consistent names — one rename call per frame
# instead of the original's duplicated per-column calls.
COLUMN_RENAMES = {"trans_date_trans_time": "trans_time", "category": "merch_cat"}
X_train.rename(columns=COLUMN_RENAMES, inplace=True)
X_test.rename(columns=COLUMN_RENAMES, inplace=True)
# Label-encode the binary gender column in place.
utils.convert_category_to_num_label(X_train, 'gender')
utils.convert_category_to_num_label(X_test, 'gender')
# Reference year used to approximate age from date of birth; a named constant
# replaces the original magic number 2020.
REFERENCE_YEAR = 2020
for _df in (X_train, X_test):
    # Time-based features derived from the transaction timestamp.
    _df['hour'] = _df['trans_time'].dt.hour     # hour of day (0-23)
    _df['day'] = _df['trans_time'].dt.weekday   # weekday (0=Mon .. 6=Sun)
    _df['month'] = _df['trans_time'].dt.month   # month (1-12)
    # Approximate customer age at REFERENCE_YEAR.
    _df['age'] = REFERENCE_YEAR - _df['dob'].dt.year
# Re-inspect schema after feature engineering (4 new int columns expected).
X_train.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1296675 entries, 0 to 1296674 Data columns (total 25 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 trans_time 1296675 non-null datetime64[ns] 1 cc_num 1296675 non-null int64 2 merchant 1296675 non-null object 3 merch_cat 1296675 non-null object 4 amt 1296675 non-null float64 5 first 1296675 non-null object 6 last 1296675 non-null object 7 gender 1296675 non-null int64 8 street 1296675 non-null object 9 city 1296675 non-null object 10 state 1296675 non-null object 11 zip 1296675 non-null int64 12 lat 1296675 non-null float64 13 long 1296675 non-null float64 14 city_pop 1296675 non-null int64 15 job 1296675 non-null object 16 dob 1296675 non-null datetime64[ns] 17 trans_num 1296675 non-null object 18 unix_time 1296675 non-null int64 19 merch_lat 1296675 non-null float64 20 merch_long 1296675 non-null float64 21 hour 1296675 non-null int64 22 day 1296675 non-null int64 23 month 1296675 non-null int64 24 age 1296675 non-null int64 dtypes: datetime64[ns](2), float64(5), int64(9), object(9) memory usage: 257.2+ MB
def _violin_by_fraud(frame, ycol, title, ylab):
    """Draw a violin plot of `frame[ycol]` split by the `is_fraud` label.

    frame: DataFrame containing `ycol` and `is_fraud` columns.
    Factors out the four near-identical violin cells of the original.
    """
    plt.figure(figsize=(30,15))
    ax = sns.violinplot(data=frame, x='is_fraud', y=ycol, hue='is_fraud',
                        palette={0: '#46A7DC', 1: "#E75858"})
    utils.set_plot_labels(ax,
                          title=(title, 40),
                          ylab=(ylab, 40))
    ax.set_xticklabels(['No Fraud', 'Fraud'], fontsize=40)
    ax.legend([])
    plt.show()

# Transaction amount vs fraud — full range, then zoomed below the $1700 outlier region.
amt_fraud_df = pd.concat([X_train['amt'], y_train], axis=1)
_violin_by_fraud(amt_fraud_df, 'amt',
                 'Transaction amount distribution on fraud occurrence', 'amount')
_violin_by_fraud(amt_fraud_df.loc[amt_fraud_df['amt'] < 1700, :], 'amt',
                 'Transaction amount distribution on fraud occurrence', 'amount')

# Age vs fraud.
age_fraud_df = pd.concat([X_train['age'], y_train], axis=1)
_violin_by_fraud(age_fraud_df, 'age', 'Age distribution on fraud occurrence', 'age')

# Age histograms side by side: non-fraud (left) vs fraud (right) transactions.
plt.figure(figsize=(30,15))
plt.subplot(1, 2, 1)
ax1 = sns.histplot(x='age', data=X_train.loc[y_train==0, :], common_norm=False,
                   stat='count', multiple='stack', binwidth=5, color='#46A7DC')
ax1 = utils.set_plot_labels(ax1, ('No Fraud Occurrence', 40), ('age', 30), ('count', 30))
plt.xticks(range(10, 100, 10), size=25)
plt.subplot(1, 2, 2)
ax2 = sns.histplot(x='age', data=X_train.loc[y_train==1, :], common_norm=False,
                   stat='count', multiple='stack', binwidth=5, color="#E75858")
ax2 = utils.set_plot_labels(ax2, ('Fraud Occurrence', 40), ('age', 30), ('count', 30))
plt.xticks(range(10, 100, 10), size=25)
plt.show()

# Hour of day vs fraud (the original misnamed this frame `age_fraud_df`).
hour_fraud_df = pd.concat([X_train['hour'], y_train], axis=1)
_violin_by_fraud(hour_fraud_df, 'hour', 'Hour on fraud occurrence', 'hour')
# Hour-of-day histogram per class (one row per is_fraud value), percent within each class.
hour_fraud_df = pd.concat([X_train['hour'], y_train], axis=1)
plot = sns.FacetGrid(hour_fraud_df, row='is_fraud', height=8, aspect=2.5, xlim=(0,24))
plot.map(sns.histplot, 'hour', stat='percent', binwidth=0.999, color='#BCDC56')
<seaborn.axisgrid.FacetGrid at 0x1618473a0>
# Weekday histogram per class (one row per is_fraud value), percent within each class.
day_fraud_df = pd.concat([X_train['day'], y_train], axis=1)
plot = sns.FacetGrid(day_fraud_df, row='is_fraud', height=8, aspect=2.5, xlim=(0,7))
plot.map(sns.histplot, 'day', stat='percent', binwidth=0.999, color='#8657DC')
<seaborn.axisgrid.FacetGrid at 0x1608b09a0>
# Month histogram per class (one row per is_fraud value), percent within each class.
month_fraud_df = pd.concat([X_train['month'], y_train], axis=1)
plot = sns.FacetGrid(month_fraud_df, row='is_fraud', height=8, aspect=2.5, xlim=(1,13))
plot.map(sns.histplot, 'month', stat='percent', binwidth=0.999, color='#69BC99')
<seaborn.axisgrid.FacetGrid at 0x1584f1b80>
# Map each free-text job title to a coarse job category via a lookup table.
job_cat = datasets.load_job_cat_dataset()
# NOTE(review): left join — any job missing from the lookup yields NaN job_cat;
# verify the table covers all job values (the later info() shows no nulls here).
X_train = pd.merge(X_train, job_cat, how = 'left', on = 'job')
X_test = pd.merge(X_test, job_cat, how = 'left', on = 'job')
# Spot-check the new column on the first rows.
X_train.loc[0:5, 'job_cat']
0 research/science 1 education 2 military/government 3 law 4 customer service 5 transport Name: job_cat, dtype: object
# Fraud vs non-fraud percentage distribution across job categories.
plt.figure(figsize=(28,10))
ax = sns.histplot(data=pd.concat([X_train['job_cat'], y_train], axis=1),
                  x='job_cat', hue='is_fraud', palette={0: '#46A7DC', 1: "#E75858"},
                  common_norm=False, stat='percent', multiple='dodge', binwidth=1, shrink= 0.5)
utils.set_plot_labels(ax,
                      ('Fraud occurrence on job category', 40),
                      ('job category', 30),
                      ('percent', 30),
                      (['Fraud', 'No Fraud'], 25))
plt.xticks(size=25, rotation=90)
plt.show()
# Fraud vs non-fraud percentage distribution across merchant categories.
plt.figure(figsize=(28,10))
ax = sns.histplot(data=pd.concat([X_train['merch_cat'], y_train], axis=1),
                  x='merch_cat', hue='is_fraud', palette={0: '#46A7DC', 1: "#E75858"},
                  common_norm=False, stat='percent', multiple='dodge', binwidth=1, shrink= 0.5)
utils.set_plot_labels(ax,
                      ('Fraud occurrence on merchant category', 40),
                      ('merchant category', 30),
                      ('percent', 30),
                      (['Fraud', 'No Fraud'], 25))
plt.xticks(size=25, rotation=90)
plt.show()
# Compare merchant locations for cards with vs without any fraudulent transaction.
overall_df = pd.concat([X_train, y_train], axis=1)
# Card numbers that appear in at least one fraudulent transaction.
uniq_fr_cc_num = overall_df.loc[overall_df['is_fraud']==1,:]['cc_num'].unique()
# data frame with a card number of which card was used for fraud at least once
df_w_fraud_card = overall_df.loc[overall_df['cc_num'].isin(uniq_fr_cc_num), :]
# data frame without a card number of which card was used for fraud at least once
df_wo_fraud_card = overall_df.loc[~overall_df['cc_num'].isin(uniq_fr_cc_num), :]
# Sample 3000 rows from each group (fixed seed) to keep the marker maps renderable.
smpld_w_fraud_card = df_w_fraud_card.sample(n=3000, replace=False, random_state=121)
smpld_w_fraud_card.reset_index(inplace=True, drop=True)
smpld_wo_fraud_card = df_wo_fraud_card.sample(n=3000, replace=False, random_state=121)
smpld_wo_fraud_card.reset_index(inplace=True, drop=True)
# Draw merchant coordinates as markers on a map (see utils helper for map details).
fraud_map = utils.get_map_with_markers(smpld_w_fraud_card, 'merch_lat', 'merch_long')
no_fraud_map = utils.get_map_with_markers(smpld_wo_fraud_card, 'merch_lat', 'merch_long')
fraud_map
no_fraud_map
# Drop amount outliers (>= $1700); the print verifies that no fraud rows are lost.
amt_threshold = 1700
print(sum(y_train[X_train['amt'] >= amt_threshold])) # number of fraud (=1) rows removed by the outlier cut = 0
y_train = y_train[X_train['amt'] < amt_threshold]
X_train = X_train.loc[X_train['amt'] < amt_threshold, :]
X_train.reset_index(inplace=True, drop=True)
y_train.reset_index(inplace=True, drop=True)
0
# One-hot encoding of the merchant category.
# NOTE(review): the boolean flag presumably means fit-on-train (True) vs
# reuse-fitted-encoder (False) — confirm against utils.get_new_df_with_onehot_encoding.
X_train = utils.get_new_df_with_onehot_encoding(X_train, 'merch_cat', True)
X_test = utils.get_new_df_with_onehot_encoding(X_test, 'merch_cat', False)
X_train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1295690 entries, 0 to 1295689 Data columns (total 39 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 trans_time 1295690 non-null datetime64[ns] 1 cc_num 1295690 non-null int64 2 merchant 1295690 non-null object 3 amt 1295690 non-null float64 4 first 1295690 non-null object 5 last 1295690 non-null object 6 gender 1295690 non-null int64 7 street 1295690 non-null object 8 city 1295690 non-null object 9 state 1295690 non-null object 10 zip 1295690 non-null int64 11 lat 1295690 non-null float64 12 long 1295690 non-null float64 13 city_pop 1295690 non-null int64 14 job 1295690 non-null object 15 dob 1295690 non-null datetime64[ns] 16 trans_num 1295690 non-null object 17 unix_time 1295690 non-null int64 18 merch_lat 1295690 non-null float64 19 merch_long 1295690 non-null float64 20 hour 1295690 non-null int64 21 day 1295690 non-null int64 22 month 1295690 non-null int64 23 age 1295690 non-null int64 24 job_cat 1295690 non-null object 25 merch_cat_entertainment 1295690 non-null float64 26 merch_cat_food_dining 1295690 non-null float64 27 merch_cat_gas_transport 1295690 non-null float64 28 merch_cat_grocery_net 1295690 non-null float64 29 merch_cat_grocery_pos 1295690 non-null float64 30 merch_cat_health_fitness 1295690 non-null float64 31 merch_cat_home 1295690 non-null float64 32 merch_cat_kids_pets 1295690 non-null float64 33 merch_cat_misc_net 1295690 non-null float64 34 merch_cat_misc_pos 1295690 non-null float64 35 merch_cat_personal_care 1295690 non-null float64 36 merch_cat_shopping_net 1295690 non-null float64 37 merch_cat_shopping_pos 1295690 non-null float64 38 merch_cat_travel 1295690 non-null float64 dtypes: datetime64[ns](2), float64(19), int64(9), object(9) memory usage: 385.5+ MB
# Drop identifiers, free-text and location fields not used for modeling.
unuse_cols = ["trans_time", "cc_num", "merchant", "first", "last", "gender", "street", "city", "state", "zip",
              "lat", "long", "city_pop", "job", "dob", "trans_num", "unix_time", "month", "job_cat"]
X_train.drop(columns=unuse_cols, inplace=True)
X_test.drop(columns=unuse_cols, inplace=True)
# Number of fraud rows remaining in the training labels.
len(y_train[y_train==1])
7506
len(y_train[y_train==1]) / len(y_train) * 100
0.579305235048507
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
# Baseline random forest on the raw (imbalanced) training set.
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
# predict() already returns hard 0/1 class labels, so the original's
# `[1 if x > 0.5 else 0 ...]` comprehension was a no-op and is dropped.
y_preds = rfc.predict(X_test)
utils.print_eval_metrics(y_test, y_preds)
precision recall f1-score support
0 1.00 1.00 1.00 553574
1 0.92 0.74 0.82 2145
accuracy 1.00 555719
macro avg 0.96 0.87 0.91 555719
weighted avg 1.00 1.00 1.00 555719
# Baseline XGBoost on the raw (imbalanced) training set.
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, y_train)
y_preds = xgb_model.predict(X_test)
utils.print_eval_metrics(y_test, y_preds)
precision recall f1-score support
0 1.00 1.00 1.00 553574
1 0.91 0.75 0.82 2145
accuracy 1.00 555719
macro avg 0.95 0.88 0.91 555719
weighted avg 1.00 1.00 1.00 555719
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
# Random undersampling: balance classes by discarding majority-class rows.
X_train_under, y_train_under = RandomUnderSampler(random_state=0).fit_resample(X_train, y_train)
rfc = RandomForestClassifier()
rfc.fit(X_train_under, y_train_under)
# predict() returns hard labels — the original's manual 0.5 thresholding was a no-op.
y_preds = rfc.predict(X_test)
utils.print_eval_metrics(y_test, y_preds)
precision recall f1-score support
0 1.00 0.98 0.99 553574
1 0.15 0.96 0.26 2145
accuracy 0.98 555719
macro avg 0.57 0.97 0.62 555719
weighted avg 1.00 0.98 0.99 555719
# XGBoost on the undersampled training set.
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train_under, y_train_under)
y_preds = xgb_model.predict(X_test)
utils.print_eval_metrics(y_test, y_preds)
precision recall f1-score support
0 1.00 0.98 0.99 553574
1 0.15 0.97 0.26 2145
accuracy 0.98 555719
macro avg 0.57 0.97 0.62 555719
weighted avg 1.00 0.98 0.99 555719
# Random oversampling: duplicate minority-class rows until the classes balance.
X_train_over, y_train_over = RandomOverSampler(random_state=0).fit_resample(X_train, y_train)
rfc = RandomForestClassifier()
rfc.fit(X_train_over, y_train_over)
# predict() returns hard labels — the original's manual 0.5 thresholding was a no-op.
y_preds = rfc.predict(X_test)
utils.print_eval_metrics(y_test, y_preds)
precision recall f1-score support
0 1.00 1.00 1.00 553574
1 0.88 0.77 0.82 2145
accuracy 1.00 555719
macro avg 0.94 0.88 0.91 555719
weighted avg 1.00 1.00 1.00 555719
# XGBoost on the oversampled training set.
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train_over, y_train_over)
y_preds = xgb_model.predict(X_test)
utils.print_eval_metrics(y_test, y_preds)
precision recall f1-score support
0 1.00 0.99 1.00 553574
1 0.31 0.93 0.46 2145
accuracy 0.99 555719
macro avg 0.65 0.96 0.73 555719
weighted avg 1.00 0.99 0.99 555719
# SMOTE: synthesize minority-class points by interpolating between nearest neighbors.
smote = SMOTE(random_state=0)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
rfc = RandomForestClassifier()
rfc.fit(X_train_smote, y_train_smote)
# predict() returns hard labels — the original's manual 0.5 thresholding was a no-op.
y_preds = rfc.predict(X_test)
utils.print_eval_metrics(y_test, y_preds)
precision recall f1-score support
0 1.00 1.00 1.00 553574
1 0.67 0.82 0.74 2145
accuracy 1.00 555719
macro avg 0.84 0.91 0.87 555719
weighted avg 1.00 1.00 1.00 555719
# XGBoost on the SMOTE-resampled training set.
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train_smote, y_train_smote)
y_preds = xgb_model.predict(X_test)
utils.print_eval_metrics(y_test, y_preds)
precision recall f1-score support
0 1.00 0.99 1.00 553574
1 0.35 0.90 0.50 2145
accuracy 0.99 555719
macro avg 0.67 0.95 0.75 555719
weighted avg 1.00 0.99 0.99 555719
from sklearn.model_selection import GridSearchCV
# Hyper-parameter grid for XGBoost (108 combinations, default 5-fold CV).
xgb_clf= xgb.XGBClassifier()
params={
    'n_estimators' : [100,200,300],
    'learning_rate' : [0.01,0.1],
    'max_depth' : [3,7,10],
    'gamma' : [0,1,2],
    'colsample_bytree' : [0.8,0.9],
}
# NOTE(review): CV runs on SMOTE-resampled data, so synthetic points derived from a
# fold's originals can leak into other folds — the recall scores here are likely
# optimistic; resampling inside each CV fold (e.g. an imblearn Pipeline) would be safer.
grid_search_model = GridSearchCV(xgb_clf, param_grid = params, scoring="recall", n_jobs=-1, verbose = 2)
grid_search_model.fit(X_train_smote, y_train_smote)
Fitting 5 folds for each of 108 candidates, totalling 540 fits
GridSearchCV(estimator=XGBClassifier(base_score=None, booster=None,
callbacks=None, colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None,
early_stopping_rounds=None,
enable_categorical=False, eval_metric=None,
gamma=None, gpu_id=None, grow_policy=None,
importance_type=None,
interaction_constraints=None,
learning_rate=None, max_bin=None,
max_cat_to_...
max_leaves=None, min_child_weight=None,
missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None,
num_parallel_tree=None, predictor=None,
random_state=None, reg_alpha=None,
reg_lambda=None, ...),
n_jobs=-1,
param_grid={'colsample_bytree': [0.8, 0.9], 'gamma': [0, 1, 2],
'learning_rate': [0.01, 0.1], 'max_depth': [3, 7, 10],
'n_estimators': [100, 200, 300]},
scoring='recall', verbose=2)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(estimator=XGBClassifier(base_score=None, booster=None,
callbacks=None, colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None,
early_stopping_rounds=None,
enable_categorical=False, eval_metric=None,
gamma=None, gpu_id=None, grow_policy=None,
importance_type=None,
interaction_constraints=None,
learning_rate=None, max_bin=None,
max_cat_to_...
max_leaves=None, min_child_weight=None,
missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None,
num_parallel_tree=None, predictor=None,
random_state=None, reg_alpha=None,
reg_lambda=None, ...),
n_jobs=-1,
param_grid={'colsample_bytree': [0.8, 0.9], 'gamma': [0, 1, 2],
'learning_rate': [0.01, 0.1], 'max_depth': [3, 7, 10],
'n_estimators': [100, 200, 300]},
scoring='recall', verbose=2)XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, gamma=None,
gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
max_leaves=None, min_child_weight=None, missing=nan,
monotone_constraints=None, n_estimators=100, n_jobs=None,
num_parallel_tree=None, predictor=None, random_state=None,
reg_alpha=None, reg_lambda=None, ...)XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, gamma=None,
gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
max_leaves=None, min_child_weight=None, missing=nan,
monotone_constraints=None, n_estimators=100, n_jobs=None,
num_parallel_tree=None, predictor=None, random_state=None,
reg_alpha=None, reg_lambda=None, ...)grid_search_model.best_params_
{'colsample_bytree': 0.9,
'gamma': 1,
'learning_rate': 0.1,
'max_depth': 10,
'n_estimators': 300}
grid_search_model.best_score_
0.9996840515317063
# Repeat the grid search on the undersampled training set (4-fold CV, same param grid).
xgb_clf= xgb.XGBClassifier()
grid_search_model_under = GridSearchCV(xgb_clf, param_grid = params, scoring="recall", n_jobs=-1, verbose = 2, cv = 4)
grid_search_model_under.fit(X_train_under, y_train_under)
Fitting 4 folds for each of 108 candidates, totalling 432 fits
GridSearchCV(cv=4,
estimator=XGBClassifier(base_score=None, booster=None,
callbacks=None, colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None,
early_stopping_rounds=None,
enable_categorical=False, eval_metric=None,
gamma=None, gpu_id=None, grow_policy=None,
importance_type=None,
interaction_constraints=None,
learning_rate=None, max_bin=None,
max_ca...
max_leaves=None, min_child_weight=None,
missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None,
num_parallel_tree=None, predictor=None,
random_state=None, reg_alpha=None,
reg_lambda=None, ...),
n_jobs=-1,
param_grid={'colsample_bytree': [0.8, 0.9], 'gamma': [0, 1, 2],
'learning_rate': [0.01, 0.1], 'max_depth': [3, 7, 10],
'n_estimators': [100, 200, 300]},
scoring='recall', verbose=2)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=4,
estimator=XGBClassifier(base_score=None, booster=None,
callbacks=None, colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None,
early_stopping_rounds=None,
enable_categorical=False, eval_metric=None,
gamma=None, gpu_id=None, grow_policy=None,
importance_type=None,
interaction_constraints=None,
learning_rate=None, max_bin=None,
max_ca...
max_leaves=None, min_child_weight=None,
missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None,
num_parallel_tree=None, predictor=None,
random_state=None, reg_alpha=None,
reg_lambda=None, ...),
n_jobs=-1,
param_grid={'colsample_bytree': [0.8, 0.9], 'gamma': [0, 1, 2],
'learning_rate': [0.01, 0.1], 'max_depth': [3, 7, 10],
'n_estimators': [100, 200, 300]},
scoring='recall', verbose=2)XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, gamma=None,
gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
max_leaves=None, min_child_weight=None, missing=nan,
monotone_constraints=None, n_estimators=100, n_jobs=None,
num_parallel_tree=None, predictor=None, random_state=None,
reg_alpha=None, reg_lambda=None, ...)XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, gamma=None,
gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
max_leaves=None, min_child_weight=None, missing=nan,
monotone_constraints=None, n_estimators=100, n_jobs=None,
num_parallel_tree=None, predictor=None, random_state=None,
reg_alpha=None, reg_lambda=None, ...)print(grid_search_model_under.best_params_)
print(grid_search_model_under.best_score_)
{'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 100}
0.9773505985939092
# Retrain XGBoost with the best hyper-parameters found on SMOTE data, evaluate on test.
xgb__opt_model = xgb.XGBClassifier(colsample_bytree=0.9, gamma=1, learning_rate=0.1, max_depth=10, n_estimators=300)
xgb__opt_model.fit(X_train_smote, y_train_smote)
y_preds = xgb__opt_model.predict(X_test)
utils.print_eval_metrics(y_test, y_preds)
precision recall f1-score support
0 1.00 1.00 1.00 553574
1 0.52 0.85 0.65 2145
accuracy 1.00 555719
macro avg 0.76 0.92 0.82 555719
weighted avg 1.00 1.00 1.00 555719
# Rebuild the undersampled set, retrain XGBoost with the best undersampling-grid
# hyper-parameters, and evaluate on the test set.
X_train_under, y_train_under = RandomUnderSampler(random_state=0).fit_resample(X_train, y_train)
xgb__opt_model_under = xgb.XGBClassifier(colsample_bytree=0.9, gamma=0, learning_rate=0.1, max_depth=10, n_estimators=100)
xgb__opt_model_under.fit(X_train_under, y_train_under)
y_preds = xgb__opt_model_under.predict(X_test)
utils.print_eval_metrics(y_test, y_preds)
precision recall f1-score support
0 1.00 0.98 0.99 553574
1 0.15 0.97 0.26 2145
accuracy 0.98 555719
macro avg 0.57 0.98 0.62 555719
weighted avg 1.00 0.98 0.99 555719
utils.draw_feature_importance_plot(np.array(X_train_under.columns), np.array(xgb__opt_model_under.feature_importances_))